# Replace [PATH] with the parent directory of the input files before running.
# All relative paths used further down are resolved against this directory.
setwd("[PATH]")
# Start from an empty workspace; all.names = TRUE also removes objects whose
# names begin with a dot.
rm(list = ls(all.names = TRUE))

library(magrittr)
library(slam)
library(reshape2)
library(tm)
library(SnowballC)
library(ggplot2)

# Location of the resolution texts (plain text files)
txt.dir <- "./RES-txt/"

# Read every file in that directory (as UTF-8) into a corpus of English
# plain-text documents
txt.corpus <- VCorpus(
  DirSource(txt.dir, encoding = "UTF-8"),
  readerControl = list(language = "en", reader = readPlain)
)

# Normalise the texts. The steps are applied from top to bottom, each one
# working on the output of the previous one.
txt.corpus <- txt.corpus %>%
  tm_map(content_transformer(tolower)) %>%      # convert to lower case
  tm_map(removePunctuation) %>%                 # drop punctuation
  tm_map(removeNumbers) %>%                     # drop digits
  tm_map(removeWords, stopwords("english")) %>% # drop English stop words
  tm_map(stemDocument) %>%                      # stemming (uses SnowballC)
  tm_map(stripWhitespace)                       # collapse repeated whitespace

# Count term occurrences per document, convert the result to a dense matrix and
# reshape it to long format: one row per (Terms, Docs) pair with its "count"
tdm <- melt(
  as.matrix(TermDocumentMatrix(txt.corpus)),
  value.name = "count"
)

# Keep only the rows of the (stemmed) PoC terms that are going to be plotted
plot.data <- tdm[
  is.element(
    tdm[["Terms"]],
    c("protect", "civilian", "journalist", "women", "violenc", "children", "elder", "personnel", "refuge")
  ),
]

# Turn the document identifier into a number so it can be used as a position.
# NOTE(review): if Docs is a factor this yields its integer level codes,
# presumably the position of each document in the corpus -- confirm.
plot.data[["Docs"]] <- as.numeric(plot.data[["Docs"]])

# Fix the order of the terms; the list is reversed so that "civilian" becomes
# the last factor level and "journalist" the first
plot.data[["Terms"]] <- factor(
  plot.data[["Terms"]],
  levels = rev(c("civilian", "protect", "violenc", "women", "children", "refuge", "elder", "personnel", "journalist"))
)

# Heat map of the selected terms: one tile per (document, term) pair, filled
# according to the log10 of the term count in that document.
source("../Visualization_Theme.R") # expected to define theme_MA() -- TODO confirm
ggplot(plot.data, aes(x = Docs, y = Terms, fill = log10(count))) +
  theme_MA() +
  geom_tile(colour = "white", height = .8, width = 1) +
  # A count of 0 gives log10(0) = -Inf.
  # NOTE(review): such tiles appear to be drawn with na.value (transparent),
  # i.e. left empty -- confirm.
  scale_fill_gradient(
    name = "Term frequency\nper resolution (log)",
    high = "#FF0000", low = "gray", na.value = "transparent"
  ) +
  # Docs is numeric, so the axis has to be a continuous scale: a discrete scale
  # only keeps breaks that match its discrete limits, and numeric data provides
  # none, which silently drops every tick and year label.
  # NOTE(review): the breaks are document positions labelled with years,
  # presumably the first resolution of each decade -- confirm.
  scale_x_continuous(
    breaks = c(79, 133, 276, 462, 647, 1285, 1908),
    labels = c("1950", "1960", "1970", "1980", "1990", "2000", "2010")
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text = element_text(size = 12)) +
  ylab("") + xlab("")
